** Replication Code for data preparation for "Triangulating the relationship between education and attitudes towards immigration"
** Qinya Feng
** Journal of Experimental Political Science

/* 
NOTE:
This do file constructs the main dataset, EduIA.dta, used for all three studies.
It contains five sections. The first three sections collect and prepare variables used in all three studies. 
Section 4 prepares education reform variables used in Study 2; it also prepares the municiality dataset, Kommun.dta, used in the Appendix analysis for Study 2.
Section 5 prepares PGI and related variales in Study 3.

Sections:
1. Basic demographic variable for STR twins
2. Immigration attitudes
3. Educational attainment
4. Education reform variables for Study 2
5. PGI and other variables for Study 3
*/


clear 
cd "H:\Dofiles\Edu_IA"

set scheme s1mono




*****************************************************
**** 1. Basic demographic variable for STR twins ****
*****************************************************

use D:\STR_RiskScores\Stata_new\STR\cohort_combined.dta, clear
* Keep actual twin pairs
bysort LopNrParID: gen n=[_N]
keep if n==2
drop n
label variable LopNrParID "Twin pair ID"

* Zygosity
gen Relatedness=.
replace Relatedness=0.5 if BESTZYG==2 | BESTZYG==4
replace Relatedness=1 if BESTZYG==1
label variable Relatedness "Coefficient of relatedness in twin pairs"
keep if Relatedness!=.
gen Zyg=3-(2*Relatedness) 
label variable Zyg "Zygosity" //Zyg = 1 for MZ twins; Zyg = 2 for DZ twins

* Other adjustments to sex and birth year
replace SEX=SEX-1
rename SEX female
label variable female "Female" //Sex = 1 for female; Sex = 0 for male
destring Byear, replace
gen BirthYear=floor(Byear/100)
label variable BirthYear "Birth year"
drop Byear
gen age_2009=2009-BirthYear

* Assign number in twin pairs(by the twin pair ID LopNrParID)
sort LopNrParID LopNr
by LopNrParID: gen TwinNr=[_n]
label variable TwinNr "Twin nr (first used)"
save EduIA.dta, replace

**Birth municipality (in 1960)
use D:\STR_RiskScores\Stata_new\FoB\FoB60.dta //FoB60 is the earliest year for available census
keep LopNr Kommun
qui: gen n=.
qui: bysort LopNr: replace n=[_n]
qui: keep if n==1
qui: drop n
rename Kommun old_BirthKommun 
merge 1:1 LopNr using EduIA.dta, keep(match using) nogen 
save EduIA.dta, replace

* Residence municipality in 1965 
use D:\STR_RiskScores\Stata_new\FoB\FoB65.dta
keep LopNr Kommun
qui: gen n=.
qui: bysort LopNr: replace n=[_n]
qui: keep if n==1
qui: drop n
rename Kommun kod65
merge 1:1 LopNr using EduIA.dta, keep(match using) nogen
rename kod65 old_Kommun65 
recast int old_BirthKommun
recast int old_Kommun65

label variable old_BirthKommun "Birth municipality (1960)"
label variable old_Kommun65 "Residence municipality (1965)"

save EduIA.dta, replace

* Parish in 1960 
//Parish is used in the education reform section to exclude individuals who were in parishes where information on the reform lacks (as the exact reform timings are not available for some parish)
use D:\STR_RiskScores\Stata_new\FoB\FoB60.dta
keep LopNr Forsamling
qui: gen n=.
qui: bysort LopNr: replace n=[_n]
qui: keep if n==1
qui: drop n
rename Forsamling Fors60
merge 1:1 LopNr using EduIA.dta, keep(match using) nogen 
save EduIA.dta, replace





**********************************
**** 2. Immigration attitudes ****
**********************************

use D:\STR_RiskScores\Stata_new\STRSalty\slty_map
keep LopNr ATTITYD33_23-ATTITYD33_24 ATTITYD33_26-ATTITYD33_27

foreach var of varlist ATTITYD33_23-ATTITYD33_24 ATTITYD33_26-ATTITYD33_27 {
qui: gen n=.
qui: bysort LopNr: replace n=[_n]
qui: keep if n==1
qui: drop n
qui: merge 1:1 LopNr using EduIA.dta, keep(match using) nogen
qui: keep if LopNrParID!=.
}

rename ATTITYD33_23 labor_immigration
rename ATTITYD33_24 language_citizenship
rename ATTITYD33_26 fewer_refugees
rename ATTITYD33_27 support_culture

* Reverse code the refugee acceptace and language test items
foreach var of varlist labor_immigration-support_culture{
  replace `var' =. if `var' == 9
}
gen refugee_r = 6 - fewer_refugees
gen language_r = 6 - language_citizenship


* Generate immigration attitude index by averaging responses
gen rawim_avg =.
replace rawim_avg = (refugee_r + labor_immigration + language_r + support_culture)/4
egen im_avg = std(rawim_avg)

* Generate latent variable for immigration attitude based on graded response model (irt grm is an extension of the 2PL model to ordered categorical items) 
irt grm refugee_r labor_immigration support_culture language_r if im_avg!=. //Main reason for graded response model: it assumes discrete responses rather than continuous response as in factor analysis
estat report, sort(b) //Report estimated IRT parameters, here the Difficulty paramter b is reported and the items are sorted according to this parameter

predict im_irt if im_avg!=., latent se(se_imirt) //Use default method empirical Bayes means to predict the latent variable
qui: egen im_irt_mean = mean(im_irt)
qui: egen im_irt_sd = sd(im_irt)
qui: replace im_irt = (im_irt - im_irt_mean) / im_irt_sd
qui: drop im_irt_mean im_irt_sd

* The following ICCgraph correspondes to Figure A.2 in the Appendix
capture graph drop refugee_r labor_immigration language_r support_culture
set graphics off
irtgraph icc refugee_r, blocation title(,) xlabel(, labsize(small)) legend(symxsize(5) size(vsmall)) name(refugee) //blocation add vertical lines for estimated item difficulties
irtgraph icc labor_immigration, blocation title(,) xlabel(, labsize(small)) legend(symxsize(5) size(vsmall)) name(labor)
irtgraph icc language_r, blocation title(,) xlabel(, labsize(small)) legend(symxsize(5) size(vsmall)) name(language)
irtgraph icc support_culture, blocation title(,) xlabel(, labsize(small)) legend(symxsize(5) size(vsmall)) name(culture)
//set graphics on 
graph combine refugee labor language culture, ///
title("ICC graphs for immigration policy items", size(medium))
graph export "Results\FigureA2.png", as(png) replace

* Label variables
label variable labor_immigration "Increase labor immigraion"
label variable language_citizenship "Introduce a language test"
label variable fewer_refugees "Accept fewer refugees"
label variable support_culture "Economic support for immigrant culture"
label variable refugee_r "Fewer refugees (reversed)"
label variable language_r "Language test (reversed)"
label variable im_avg "Immigration policy attitude (Avg)"
label variable im_irt "Immigration policy attitude (IRT)"

save EduIA.dta, replace






***********************************
**** 3. Educational attainment ****
***********************************

use D:\STR_RiskScores\Stata_new\LISA\LISA_2009 //Highest level of education obtained, matched from LISA register 2009.
keep LopNr Sun2000niva Sun2000Inr
foreach var of varlist Sun2000niva Sun2000Inr{
gen n=.
bysort LopNr: replace n=[_n]
keep if n==1
drop n
merge 1:1 LopNr using EduIA.dta, keep(match using) nogen 
keep if LopNrParID!=.
}


* Years of education based on the highest degree obtained
gen education_year=.
replace education_year=7 if Sun2000niva < 200 & Sun2000niva !=.
replace education_year=9 if inrange(Sun2000niva, 200, 206)
replace education_year=9.5 if Sun2000niva==204 
replace education_year=10 if inrange(Sun2000niva, 310, 319)
replace education_year=11 if inrange(Sun2000niva, 320, 329)
replace education_year=12 if inrange(Sun2000niva, 330, 339)
replace education_year=13 if inrange(Sun2000niva, 410, 419)
replace education_year=14 if inrange(Sun2000niva, 520, 529)
replace education_year=15 if inrange(Sun2000niva, 530, 539)
replace education_year=16 if inrange(Sun2000niva, 540, 549)
replace education_year=17 if inrange(Sun2000niva, 550, 559)
replace education_year=18 if inrange(Sun2000niva, 600, 629)
replace education_year=20 if inrange(Sun2000niva, 640, 649)

* Tertiary education
gen tertiary_edu =.
replace tertiary_edu = 1 if Sun2000niva > 529 & Sun2000niva != 999 & Sun2000niva !=.
replace tertiary_edu = 0 if Sun2000niva < 530 & Sun2000niva !=.

** Education orientation(Orientation of the hightest degree)
gen Sun2000Inr1="*"
replace Sun2000Inr1=substr(Sun2000Inr,1,1) //Keep the first digit of Sun2000Inr1
destring Sun2000Inr1, replace
gen edu_orientation=.

replace edu_orientation=1 if Sun2000Inr1==0 //Allmän utbildning
replace edu_orientation=2 if Sun2000Inr1==1 //Pedagogik och lärarutbildning
replace edu_orientation=3 if Sun2000Inr1==2 //Humaniora och konst
replace edu_orientation=4 if Sun2000Inr1==3 //Samhällsvetenskap, juridik, handel, administration
replace edu_orientation=5 if Sun2000Inr1==4 //Naturvetenskap, matematik och data
replace edu_orientation=6 if Sun2000Inr1==5 //Teknik och tillverkning
replace edu_orientation=7 if Sun2000Inr1==6 //Lant- och skogsbruk samt djursjukvård
replace edu_orientation=8 if Sun2000Inr1==7 //Hälso- och sjukvård samt social omsorg
replace edu_orientation=9 if Sun2000Inr1==8 //Tjänster
replace edu_orientation=10 if Sun2000Inr1==9 //Okänd

save EduIA.dta, replace

label variable education_year "Years of Education"
label variable tertiary_edu "Tertiary Education"
label variable edu_orientation "Education orientation"

label define tertiary_edu 0 "Non-tertiary education" 1 "Tertiary education"
label values tertiary_edu tertiary_edu

label define edu_orientation 1 "General education" 2 "Pedagogy and teaching" 3 "Humanities and art" 4 "Social sciences Law Business Administration" 5 "Science Mathematics Data" 6 "Technology and Manufacturing" 7 "Agriculture Forestry Veterinary" 8 "Health care and Social care" 9 "Services" 10 "Unknown", replace
label values edu_orientation edu_orientation

save EduIA.dta, replace





*************************************************
**** 4. Education reform variables for Study 2 **
*************************************************

** 4.1 Reform_indicator function
//The following reform_indicator function is adapted based on Lindgren et al. (2017)
//The reform_indicator function determines whether an individual was exposed to the reform
capture program drop reform_indicator
program define reform_indicator
capture drop reformKommun60
capture drop treatment
capture drop firstcohort60

* For individuals born after 1950, the most proximate year for residence munipality during the reform affected years is 1965 (i.e., the 1950 cohort were about 15 yrs old);
gen reformKommun60 = .
replace reformKommun60 = old_BirthKommun //inhabited municipality from FoB60
replace reformKommun60 = old_Kommun65 if BirthYear >= 1950 & BirthYear !=. //inhabited municipality from FoB65
recast int reformKommun60

save EduIA.dta, replace

* Get Holmlund's table of municipality treatments (Holmlund, 2007)
insheet using "D:\Data\ExtData\SkolData\skd-slutgiltiga_reformkommuner_fob60.out", clear
qui: gen n=.
qui: bysort kommun60: replace n=[_n]
keep if n==1
qui: drop n
rename kommun60 reformKommun60

* Merge the municipality reform dataset with the main dataset
merge 1:m reformKommun60 using EduIA.dta, keep(match using) nogen
qui: gen n=.
qui: bysort LopNr: replace n=[_n]
keep if n==1
qui: drop n

* Generate reform treatment indicator and cleanup 
* Determine whether exposed to education refrom by inhabited municipality & age at the time (whether a person is young enough to be in the reform cohorts)
gen treatment =.
replace treatment = 1 if BirthYear >= firstcohort60 & BirthYear!=. & firstcohort60!=.
replace treatment = 0 if BirthYear < firstcohort60 & firstcohort60 !=.

* Drop unclear municipalities/parishes:
replace treatment =. if Fors60 == 18017	/*hägersten*/
replace treatment =. if Fors60 == 18018	/*brännkyrka*/
replace treatment =. if Fors60 == 18019	/*vantör*/
replace treatment =. if Fors60 == 18020	/*enskede*/
replace treatment =. if Fors60 == 18021	/*skarpnäck*/
replace treatment =. if Fors60 == 18022	/*farsta*/

replace treatment =. if reformKommun60 == 281	/*södertälje*/
replace treatment =. if reformKommun60 == 283	/*sundbyberg*/
replace treatment =. if reformKommun60 == 580	/*linköping*/
replace treatment =. if reformKommun60 == 680	/*jönköping*/

replace treatment =. if Fors60 == 128007	/*limhamn*/
replace treatment =. if reformKommun60 == 1283	/*hälsingborg*/

replace treatment =. if Fors60 == 148016	/*örgryte*/
replace treatment =. if Fors60 == 148017	/*lundby*/
replace treatment =. if Fors60 == 148019	/*brämaregården*/
replace treatment =. if Fors60 == 148003	/*gamlestads=st pauli*/
replace treatment =. if Fors60 == 148006	/*härlanda*/

replace treatment =. if reformKommun60 == 2482		/*skellefteå*/
		
* Label key variables
label variable treatment "Reform treatment"
label variable firstcohort60 "Reform year cohort"
label variable reformKommun60 "Inhabited municipality used for reform treatment"

save EduIA.dta, replace

end

reform_indicator




** 4.2 Sample restriction
//The sample_restriction function filters the sample with several necessary conditions, and generate sample indicators based on different reform window widths
capture program drop define_sample
program define define_sample

capture drop precohort 
capture drop window 
capture drop sample`1'

* Was the individual born the year right before the reform cohort? Exclude individuals who are more likely to be subject to self-selection into the reform cohort
gen precohort = firstcohort60 - BirthYear
replace precohort = 0 if precohort != 1

* Windows around reform: an individual's own birth year compared to the reform year cohort in the corresponding municipality
gen window = BirthYear - firstcohort60

* Define sample: 
/* 
1) exposure to the reform should not be too far from the respective first reform cohort to address concerns regarding the comparability between the treatment and control groups over this long time period,
further away from the reform start year, reform effects might be more likely to be conflated with other changes.
Here I set time frames at 10-year to 6-year, i.e., sample restricted to individuals exposed to the reform 10 to 6 years before or after their municipalities' first reform cohort.
2) exclude individuals who were one year older than the first reform cohort, as they might be more likely to subject to self selection
3) individual's birth year should fall within the reform affected cohort range [1943, 1955]
4) municipality's first reform cohort should fall within the general reform affected cohort range [1943, 1955]
*/

gen sample`1' = 0
local frame = `1'
replace sample`1' = 1 if inrange(window, -`frame', `frame') & precohort != 1 & inrange(firstcohort60, 1943, 1955) & inrange(BirthYear, 1943, 1955)  

save EduIA.dta, replace
	
end

define_sample 10 
define_sample 9
define_sample 8
define_sample 7
define_sample 6



** 4.3 Dataset for municipality sociopolitical characteritics 
//The resultant dataset, Kommun.dta, contains Swedish general election related variables, covering elections in 1940, 1944, 1948, 1952, 1956, 1958, 1960, 1964

* Merge municipalities and their first reform cohorts (using Holmlund (2007)) with municipality variables
use "D:\Data\ExtData\ValResData\ValResultat1911_2006.dta", clear //Based on Lindgren et al.(2019). Original data from the Election Data Archive (Valdataarkivet) 1948-1970 and the Swedish Electoral Data (Valdata) 1911-1944, from the Swedish National Data Service (SND)
destring Kom60, replace
rename Kom60 kommun60
merge m:1 kommun60 using "D:\Data\ExtData\SkolData\slutgiltiga_reformkommuner_fob60.dta" //Based on Holmlund (2007)

drop if _merge==2
drop _merge
keep kommun60 Ar firstcohort60 RostBerKom60 ValdeltKom60 AndelKom60_m AndelKom60_fp AndelKom60_c AndelKom60_s AndelKom60_v //keep population 
keep if Ar >= 1940 & Ar < = 1964 & Ar !=.
bysort kommun60 Ar: gen n = _n
keep if n == 1
drop n

rename RostBerKom60 RostBer //Population eligible to vote
rename ValdeltKom60 Valdelt //Turnout
rename AndelKom60_m Andel_m //Moderate party vote share (högern/moderaterna)
rename AndelKom60_fp Andel_fp //Liberal party vote share (liberaler/frisinnade/folkpartiet)
rename AndelKom60_c Andel_c //Central party vote share (bondeförbundet/centern)
rename AndelKom60_s Andel_s //Social democrats party vote share (socialdemokrater)
rename AndelKom60_v Andel_v //Left party vote share (kommunistiska partier/vänsterpartiet)
rename kommun60 reformKommun60 //Municipality according to 1960 census

label variable RostBer "Number of eligible voters"
label variable Valdelt "Turnout"
label variable Andel_m "Share of votes for the moderate parties"
label variable Andel_fp "Share of votes for the liberal parties"
label variable Andel_c "Share of votes for the central parties"
label variable Andel_s "Share of votes for the social democrats"
label variable Andel_v "Share of votes for the left parties"
label variable Ar "Year"
label variable firstcohort60 "First reform cohort (birth year)"
label variable reformKommun60 "Municipality according to 1960 census"

save Kommun.dta, replace

* Calculate pre-reform municipality variables (1940-1948): means and SDs of available municipality characteristics during 1940-1948 (elections in 1940, 1944, 1948)
capture program drop kommun_trend
program define kommun_trend

* Mean
use Kommun.dta
keep if Ar >= 1940 & Ar <= 1948
drop firstcohort60

if `1' == RostBer {
    replace RostBer = log10(RostBer) //logarithmise eligible voter population 
}

   collapse (mean) avg_`1' = `1', by(reformKommun60)
   merge 1:m reformKommun60 using EduIA.dta, keep(match using) nogen
   save EduIA.dta, replace

* SD
use Kommun.dta
keep if Ar >= 1940 & Ar <= 1948
drop firstcohort60

if `1' == RostBer {
    replace RostBer = log10(RostBer) //logarithmise eligible voter population 
}

    collapse (sd) sd_`1' = `1', by(reformKommun60)
    merge 1:m reformKommun60 using EduIA.dta, keep(match using) nogen
    save EduIA.dta, replace
end


kommun_trend RostBer
kommun_trend Valdelt
kommun_trend Andel_m
kommun_trend Andel_fp
kommun_trend Andel_c
kommun_trend Andel_s
kommun_trend Andel_v




********************************************
** 5. PGI and other variables for Study 3 **
********************************************

** 5.1 PGI, first 20 principal components, and batch indicator
use D:\STR_RiskScores\Stata_new\PGS\repository_1
keep LopNr PGI_EA_single batch PC1-PC20 

* Standardize EA PGI by batch
qui: gen n=.
qui: bysort LopNr: replace n=[_n]
qui: keep if n==1
qui: drop n
qui: egen PGI_EA_single_mean = mean(PGI_EA_single), by(batch)
qui: egen PGI_EA_single_sd = sd(PGI_EA_single), by(batch)
qui: gen stdPGI_EA_single = (PGI_EA_single - PGI_EA_single_mean) / PGI_EA_single_sd
qui: drop PGI_EA_single_mean PGI_EA_single_sd

merge 1:1 LopNr using EduIA.dta, keep(match using) nogen
save EduIA.dta, replace

rename stdPGI_EA_single PGI_EA
label variable PGI_EA "EA PGI"

save EduIA.dta, replace




** 5.2 Locus of control
* Get items on locus of control from SALTY as a potential mediator 
forvalues i=1(1)12 {
	use "D:\STR_RiskScores\Stata_new\STRSalty\slty_map.dta"
	keep LopNr PERSONLIGHET2_`i'
	gen n=.
	bysort LopNr: replace n=[_n]
	keep if n==1
	drop n
	merge 1:1 LopNr using EduIA.dta, keep (match using) nogen
	replace PERSONLIGHET2_`i' = . if PERSONLIGHET2_`i'==9
	rename PERSONLIGHET2_`i' tp`i'
	save EduIA.dta, replace 
}
gen LOC_raw = tp1-tp2-tp3-tp4+tp5+tp6+tp7-tp8-tp9-tp10-tp11-tp12
sum LOC_raw
qui: egen LOC_mean = mean(LOC_raw) 
qui: egen LOC_sd = sd(LOC_raw)
gen LOC = (LOC_raw-LOC_mean)/LOC_sd
drop tp* LOC_raw
label variable LOC "Locus of control"
save EduIA.dta, replace



clear

